#!/usr/bin/perl

use POSIX;
use strict;

#use Text::CSV;
use Data::Dumper qw(Dumper);
use List::Util qw(min max sum);
use List::MoreUtils qw(uniq firstidx);
use Bio::AlignIO;
use Bio::Align::DNAStatistics;
use Bio::Tools::Run::Phylo::PAML::Codeml;
use Bio::Align::Utilities;
use Bio::TreeIO;
use Statistics::Basic qw(:all nofill);
use Cwd;
use Cwd 'abs_path';
use Getopt::Long;
use experimental 'smartmatch';

use lib './scripts';
use ConservatoryUtils;
use GenomeDatabase;
use MappingDatabase;
use CNSDatabase;
use ConservatoryTree;
use CREOrthologList;


$|=1;

###############################################################################
######### Setup

my $conservatoryDir=abs_path(".");

my $EMPTY_FILE = 0; # The size of an empty MAF file, containing no alignments
my $CLASSIFICATION_CLASSES_NUM=6;  ## The number of phylogeny classification categories in the genome_database file. Currently default is 6 (starts from 0).

my $curDir;
my $tmpDir;
my $locus;
my $referenceGenome;

my $noPositionFilter =0;                 ### Filter CNS based on position (upstream vs downstream)
									
##### Alignment parameters
my $minIdentityFamily= 70;
my $minIdentityGlobal= 70;
my $alignmentSeqOverlapToMerge = 0.8; # In case we have multiple alignments from a single promoter to the same region in the reference genome, we will filter them
									# This parameter defines the minimal overlap to consider two alignments.

### PhyloP parameters and CNS filtering parameters
my $minOrthologsForConservationAnalysis =5; # The smallest number of orthologs to consider conservation analysis
my $minPhyloPscore=2;  # Default minumum phyloP score
my $minCNSLength=8;
my $minBPinCNS=6; # minimum number of basepair to be logged as a CNS alignment

my $CNSMergeLength=6;   # What is the maximum distance between conserved nucleotides to merge into a single CNS

my $minFamilySpeciesForCNS=0.333; # The minimum number of species in the family (out of the total species in the family) that have a CNS to be considered as one

#### CNS coverage and splitting parameters
my $minAlignmentLengthToTestOverlap = 30; ### Alignments of less than this length will be checked to see that they significally overlap with the CNS
my $minCNSCoverageInitial=0.2; ### The minimum coverage of the CNS to be acceptable for an initial deep alignment
my $minCNSCoverageAfterSplit=0.5; ### The minimum coverage of CNS to assign to a sub-CNS after splitting
my $minCNSConservationAfterSplit = 0.4; ### The minimal relative number of bases with significant conservation score to have for a split CNS to be considered

my $minCNSCoverageFinal=0.5; ## The coverage a final CNS must have to be considered a hit to the CNS (after spliting and filtering)

#### Alignment filtering parameters
my @minSpeciesPerConservationLevel= (200,125,100,80,30,10); ## Must have atleast these many alignments to support level assignment

my $minHSPthresholdFamily=2000;
my $minHSPthresholdGlobal=1600;

###### Other parameters
my $keep_tmp = 0;
my $verbose = 0;
my $force=0;  
my $skipMissingGenomes=0;
my $justFamily = 0;
my $justOneOrtholog =0;
my $noGenomeCoordinates=0;

my $help = 0;

########### Reg Seq lengths
my $upstreamLength;
my $downstreamLength;

my %upstreamLengthReference;
my %downstreamLengthReference;

GetOptions ("conservatoryDirectory=s" => \$conservatoryDir,
			"locus=s" => \$locus,
			"reference=s" => \$referenceGenome,
			"min-identity-family=i" => \$minIdentityFamily,
			"min-identity-global=i" => \$minIdentityGlobal,
			"min-align-threshold-family=i" => \$minHSPthresholdFamily,
			"min-align-threshold-global=i" => \$minHSPthresholdGlobal,			
			"min-phylop-score=f" => \$minPhyloPscore,
			"min-deep-cns-coverage" => \$minCNSCoverageInitial,
			"no-genome-coordinates" => \$noGenomeCoordinates,
			"just-one-ortholog" => \$justOneOrtholog,
			"no-position-filter" => \$noPositionFilter,
			"cns-merge-length" => \$CNSMergeLength,
			"just-family" => \$justFamily,
			"keep-tmp" => \$keep_tmp,
			"tmp-dir=s" => \$tmpDir,
			"verbose" => \$verbose,
			"force" => \$force,
			"skip-missing-genomes" => \$skipMissingGenomes,
			"help" => \$help) or die ("Error in command line arguments\n");

if($verbose) { print "Conservatory version 2.0.1\n\n"; }

if($minCNSCoverageInitial <0 || $minCNSCoverageInitial>1) {
	print "\n\nBad parameters: min-deep-cns-coverage must be between 0 and 1.\n\n";
	$help = 1;
}

if( $locus eq "" or $referenceGenome eq "" or $help ) {
	if($locus eq "" or $referenceGenome eq "") {
		print "\n\nMissing parameter: must supply both reference genome and locus.\n\n";
	}

   print "buildConservation --reference <familyName> --locus <Locus>\n\n\n";
   print "\t--reference\tReference genome name (REQUIRED). Genomes must be processed with processGenome --global for the reference genome before running buildConservation.\n";
   print "\t--locus\t\t\tLocus in the reference genome for the family (REQUIRED).\n";
   print "\t--conservatoryDirectory\t\tPath of the main conservatory directory.\n";
   print "\t--force\t\tOverwrite existing CNS file.\n";
   print "\n\t\tAlignment options\n\n";
   print "\t--min-identity-family\t\tMinimum identity for alignments within the family. Default is 70.\n";
   print "\t--min-identity-globa\t\tMinimum identity for alignments of CNS for genomes outside the family. Default is 60.\n";
   print "\t--min-align-threshold-family\t\tMimimum alignment score threshold for within family alignments. Default is 2000.\n";
   print "\t--min-align-threshold-global\t\tMimimum alignment score threshold for global alignments. Default is 1600.\n";
   print "\t--min-phylop-score\t\tMinimum phyloP score to consider as significant CNS (default: 1.5).\n";
   print "\t--min-deep-cns-coverage\t\tMimimum alignment coverage for deep CNS (Expressed as ratio of CNS length: 0-1). Default is 0.2.\n";
   print "\t--min-species-per-conservation-level\t\tSpecies number filter for level conservation. Defines the mimimum number of species a CNS has to be found it to allow a certain conservation level. A string of 6 comma delimited values. Default is 150,100,100,50,5,5.\n";   
   print "\t--min-orf-length\t\tMinimum length (in bp) of ORF to identify in CNS. Deafult is \"auto\" which is automatically calculated based on expected random ORF occurance in data.\n";
   print "\t--atrich-length-filter\t\tLength of AT-rich strech to mask (Default 10).\n";
   print "\t--cns-merge-length\t\tMerge CNSs which distance apart is less than this parameter (Default 5).\n";   
   print "\t--no-position-filter\t\tDo not filter CNS conservation based on relative position to the gene (allow CNSs to move up/down stream to the gene).\n";
   
   print "\t--skip-missing-genomes\t\tIf genome information can't be found, skip and continue analysis (DEFAULT: Terminate with error message),\n";
   print "\t--tmp-dir\t\tSpecify the temporary directory (DEFAULT is alignments/<referenceName>/tmp).\n"; 
   print "\t--keep-tmp\t\tDo not delete all the temporary files produced in the temporary directory.\n";
 
   print "\n\t\tGeneral options\n\n";
   print "\t--verbose\t\tOutput extra messages.\n";
   print "\t--help\t\t\tPrints this message.\n\n";
   
   exit();
}

### Set up directory and file access

my $outputDir = "$conservatoryDir/alignments/$referenceGenome/";
my $outputCNSDir = "$conservatoryDir/CNS/$referenceGenome/";

my $finalFASTAUpFileName = "$outputDir/$locus.up.fasta";
my $finalFASTADownFileName = "$outputDir/$locus.down.fasta";

my $finalCNSFileName = "$outputCNSDir/$locus.cns.csv";
my $finalMapFileName = "$outputCNSDir/$locus.map.csv";
my $finalPhylopBedFileName = "$outputCNSDir/$locus.phylop.bed";

### Alignment, orthologs and CNSs
my %MSAUpstream;
my %MSADownstream;
my %mapAlignmentFamily;

my $CNSDB = new CNSDatabase();
my $mappingDB = new MappingDatabase();

my @phyloPconservationMapUp;
my @phyloPconservationMapDown;

####### First, sanity checks. Check to see if directory structure is OK and if programs are installed
####### 
die "ERROR: Conservatory directory structure at ($conservatoryDir) is corrupt\n" unless (-e "$conservatoryDir/genomes" && -e "$conservatoryDir/genomes/blastdb/" && -e "$conservatoryDir/scripts" && -e "$conservatoryDir/alignments");

my $t_lastz = `sh -c 'command -v lastz'`;
die "ERROR: Cannot find lastz in path. Please make sure it is installed.\n" unless ($t_lastz ne ""); 

my $t_samtools = `sh -c 'command -v samtools'`;
die "ERROR: Cannot find samtools in path. Please make sure it is installed.\n" unless ($t_samtools ne "");  

my $t_phyloP = `sh -c 'command -v phyloP'`;
die "ERROR: Cannot find phyloP in path. Please make sure phast is installed.\n" unless ($t_phyloP ne "");  

my $t_wig2bed = `sh -c 'command -v wig2bed'`;
die "ERROR: Cannot find wig2bed in path. Please make sure bedops is installed.\n" unless ($t_wig2bed ne "");  

die "ERROR: Cannot find fastML executable at $conservatoryDir/scripts/fastml.\n" unless (-x "$conservatoryDir/scripts/fastml");
###############################################
### Make sure we are not overwriting existing files (unless we are forced to)
if ( (-e $finalCNSFileName) && !$force  ) { die "END: Output CNS file for $locus ($finalCNSFileName) already exists. Use --force to overwrite.\n"; }
if( (-e $finalFASTAUpFileName) && (!$force) && $justFamily) { die "END: Output family alignment file for $locus ($finalFASTAUpFileName) already exists. Use --force to overwrite.\n"; }

###############################################
if($verbose) { print "START: Begin conservatory analysis for $locus.\n"; }

if( (-e $finalCNSFileName) && $force && $verbose) { print "PROGRESS: Overwriting existing file $finalCNSFileName.\n"; }

my $genomeDB = new GenomeDatabase($conservatoryDir);
die "ERROR: Cannot find reference genome $referenceGenome in the genome database file.\n" unless $genomeDB->genomeExists($referenceGenome);
my $referenceGenomeFamily = $genomeDB->genomeToFamily($referenceGenome);
my $refLocusFullName = $genomeDB->getLocusFullName($referenceGenome, $locus);
my $referenceSpecies = $genomeDB->genomeToSpecies($referenceGenome);
## set parameters
$genomeDB->setMinFamilyIdentity($minIdentityFamily);
$genomeDB->setMinFamilyThreshold($minHSPthresholdFamily);

### Setup model and tree
my $treeFileName = "$conservatoryDir/genomes/$referenceGenomeFamily.tree";
my $modelFileName = "$conservatoryDir/genomes/$referenceGenomeFamily.mod";


my $conservatoryTree = new ConservatoryTree($genomeDB);
my $familyTree = new ConservatoryTree($genomeDB, $treeFileName);
if($tmpDir eq "") {
	$tmpDir = $genomeDB->getTemporaryDir();
}

$tmpDir = "$tmpDir/$refLocusFullName/";

$genomeDB->setTemporaryDir($tmpDir);

### Extract the absolute coordinates of the locus

my %referenceAbsoluteCoordiantes = getGeneCoordinates($conservatoryDir,$referenceGenomeFamily, $referenceGenome, $refLocusFullName);
my $refGeneChr = $referenceAbsoluteCoordiantes{'Chr'};
my $refGeneStart = $referenceAbsoluteCoordiantes{'Start'};
my $refGeneEnd = $referenceAbsoluteCoordiantes{'End'};
my $refGeneStrand= $referenceAbsoluteCoordiantes{'Strand'};

die "ERROR: Cannot find locus $refLocusFullName in reference genome $referenceGenome.\n" unless ($refGeneChr ne "");

if($verbose) { print "PROGRESS: Extracted genome coordinates $locus:$refGeneChr:$refGeneStart:$refGeneEnd:$refGeneStrand. Upstream sequence " . $genomeDB->getUpstreamLength($referenceGenome) . " bp. Downstream sequence " . $genomeDB->getUpstreamLength($referenceGenome) . " bp.\n"; }

### make place for the alignment and CNS
if (! -e $outputDir) { mkdir($outputDir); }
if (! -e $tmpDir) { mkdir($tmpDir); }
if (! -e $outputCNSDir) { mkdir($outputCNSDir); }	

#remember where we are
$curDir = getcwd;

chdir($tmpDir);

####### Set up the reference fasta files
my $upstreamRefFastaName = "$tmpDir/$referenceGenome.$locus.up.fasta";
my $downstreamRefFastaName = "$tmpDir/$referenceGenome.$locus.down.fasta";

#### Perform family alignments

my %MSAUpstream, my %MSADownstream;

$MSAUpstream{$referenceSpecies} = extractFastaToFile( $genomeDB->getConservatoryDir() . "/genomes/" . $genomeDB->genomeToFamily($referenceGenome) . "/$referenceGenome" . ".upstream.fasta.gz", $refLocusFullName, $upstreamRefFastaName);
$MSADownstream{$referenceSpecies} = extractFastaToFile($genomeDB->getConservatoryDir() . "/genomes/" . $genomeDB->genomeToFamily($referenceGenome) .  "/$referenceGenome" . ".downstream.fasta.gz", $refLocusFullName, $downstreamRefFastaName);

if($verbose) { print localtime() . ": Done extracting reference sequences.\n";}

# set up CRE-orthology hashes
##############################################################################
## get ortholog sequences
my $orthologList = new CREOrthologList($genomeDB);

foreach my $curGenome ($genomeDB->getGenomeNamesInFamily($referenceGenomeFamily)) {
	my $orthologListForGenome = new CREOrthologList($genomeDB, $referenceGenome, $refLocusFullName, $curGenome, $upstreamRefFastaName, $downstreamRefFastaName, 0, $verbose);
	if(defined $orthologListForGenome) { $orthologList->add($orthologListForGenome); } 
}

if(!$keep_tmp) {
	unlink($upstreamRefFastaName);
	unlink($downstreamRefFastaName);
}

#####################################################################################################
##### Now we have assembled the list of putative CRE orthologs. Select the best genome for each species

$orthologList->pickBestPutativeOrthologs($justOneOrtholog,$verbose);

if ($verbose) { print "PROGRESS: Done with CRE-ortholog selection. Collected " . $orthologList->getOrthologsNumber() . " CRE-orthologs from " . $orthologList->getSpeciesNumber() . " species.\n"; }
####################################################################################################################################
##### At this point, we have selected our CRE orthologs and have all the filtered upstream and downstream sequences in fasta files 
##### Perform alignments to the reference genome
if($orthologList->getSpeciesNumber() > $minOrthologsForConservationAnalysis) {
	### First, record the orthologs
	$orthologList->writeList( "$outputDir/$locus.CREorthologs.txt");

	if($verbose) { print "PROGRESS: Multiple sequence assembly...\n"; }
	foreach my $curOrtholog ($orthologList->getOrthologsByQuality() ) {
		$mappingDB->add(loadMAF($curOrtholog, $genomeDB->getUpstreamLength($referenceGenome), $genomeDB->getUpstreamLength( $curOrtholog->getGenome() ), \%MSAUpstream, "U","U"));
		$mappingDB->add(loadMAF($curOrtholog, $genomeDB->getDownstreamLength($referenceGenome), $genomeDB->getDownstreamLength( $curOrtholog->getGenome() ), \%MSADownstream, "D","D"));
	}

	### Add empty sequences for missing species
	foreach my $curSpecies ($genomeDB->getSpeciesForFamily($referenceGenomeFamily)) {
		if(!defined $MSAUpstream{$curSpecies}) {
			$MSAUpstream{$curSpecies} = 'N' x $genomeDB->getUpstreamLength($referenceGenome);
		}
		if(!defined $MSADownstream{$curSpecies}) {
			$MSADownstream{$curSpecies} = 'N' x $genomeDB->getDownstreamLength($referenceGenome);
		}
	}
	### and write the MSA fasta file
	writeMSAToFasta(\%MSAUpstream, $finalFASTAUpFileName);
	writeMSAToFasta(\%MSADownstream, $finalFASTADownFileName);	
	$orthologList->removeTemporaryFiles();

	### Get CNS from alignments
	if($verbose) { print "PROGRESS: Begin family conservation analysis for $locus. Model file: $modelFileName. Minimum phyloP score: $minPhyloPscore.\n"; }

	my $phyloPParameters = "--seed 123 --wig-scores --no-prune --method SCORE --mode CON";
	my $CNSFound=0;
	my @CNSCoordinates;
	my @rawUpstreamCoords, my @rawDownstreamCoords;

	if(-s $finalFASTAUpFileName >0) {
		open(my $upstreamPhylopBedFile, "phyloP $phyloPParameters $modelFileName $finalFASTAUpFileName | wig2bed | sort -k1,1 -k2,2n |");
		@rawUpstreamCoords = bedFileToCoordinateList($upstreamPhylopBedFile, -($genomeDB->getUpstreamLength($referenceGenome)));
		close ($upstreamPhylopBedFile);

		@phyloPconservationMapUp = coordinateListToMap(\@rawUpstreamCoords, $genomeDB->getUpstreamLength($referenceGenome), $genomeDB->getUpstreamLength($referenceGenome));
		push(@CNSCoordinates, mergeCoordinates(\@rawUpstreamCoords, $minPhyloPscore, $CNSMergeLength, $minCNSLength));
	}

	if(-s $finalFASTADownFileName >0) {
		open(my $downstreamPhylopBedFile, "phyloP $phyloPParameters $modelFileName $finalFASTADownFileName | wig2bed | sort -k1,1 -k2,2n |");
		@rawDownstreamCoords = bedFileToCoordinateList($downstreamPhylopBedFile, 0);
		close ($downstreamPhylopBedFile);
		@phyloPconservationMapDown = coordinateListToMap(\@rawDownstreamCoords, $genomeDB->getDownstreamLength($referenceGenome) , 0);
		push(@CNSCoordinates, mergeCoordinates(\@rawDownstreamCoords, $minPhyloPscore, $CNSMergeLength, $minCNSLength));
	}
	##### translate the raw coordinates and output a bed file
	push(@rawUpstreamCoords, @rawDownstreamCoords);
	my @absRawCoords = translateRealtiveToAbsoluteCoordinates(\@rawUpstreamCoords, $refGeneStart, $refGeneEnd, $refGeneStrand,"+");
	my @absoluteCoordinates = translateRealtiveToAbsoluteCoordinates(\@CNSCoordinates, $refGeneStart, $refGeneEnd, $refGeneStrand,"+");

	### Now dump absolute coordinate

	open(my $finalPhylopBedFile, ">", $finalPhylopBedFileName);
	foreach my $coord (@absRawCoords) {
		print $finalPhylopBedFile join("\t", ($refGeneChr, $coord->{'Start'}, ($coord->{'Start'} + $coord->{'Len'}+1),  $coord->{'pValue'}  )) . "\n";
	}
	close($finalPhylopBedFile);

	if($verbose) { print "PROGRESS: End conservation analysis for $locus.\n"; }

	if((scalar @CNSCoordinates) == 0) {
		if($verbose) { print ("END: No CNS found for $locus.\n"); } 
		exit(0);
	}

	## Now build the CNS Database
	foreach my $CNSCoordinate (@CNSCoordinates) {
		my $positionInReference = $CNSCoordinate->{'Start'} ;

		my $CNSLength = $CNSCoordinate->{'Len'};
		my $CNSID = "$locus.$positionInReference";
		if($verbose) { print("PROGRESS: Creating $CNSID ($CNSLength)..."); }
		my $newCNS = new CNS($referenceGenome, $CNSID, $locus, $positionInReference, $CNSLength);
		# Add mapping to reference
		my $refSeq;
		if($positionInReference<0) {
			$refSeq = substr($MSAUpstream{$referenceSpecies}, $positionInReference + $genomeDB->getUpstreamLength($referenceGenome), $CNSLength );
		} else {
			$refSeq = substr($MSADownstream{$referenceSpecies}, max(0,$positionInReference-1), $CNSLength );
		}
		my $referenceMapping = new Mapping($newCNS->getID(), $referenceSpecies, $refLocusFullName, $positionInReference, "+", 0, $refSeq,$refSeq );
		$CNSDB->add($newCNS);
		$mappingDB->add($referenceMapping);
		my %cnsSeqs;
		$cnsSeqs{$referenceSpecies} = $referenceMapping->getCNSSeq();
		### Assign mappings to CNS
		my $flag=0;
		foreach my $curMapping (@{ $mappingDB->getMappingsForCNS($refLocusFullName)}) {
			if($curMapping->overlap($newCNS, 0 , $minBPinCNS) > $minCNSCoverageInitial ) {
				my $mappingForCNS = $curMapping->createSubSetMapping($positionInReference, $positionInReference+$CNSLength);
				if($mappingForCNS->getSeqBP() >= $minBPinCNS && $mappingForCNS->identity() > $minIdentityFamily) {
					$mappingForCNS->setCNSID($CNSID);
					$mappingForCNS->setRRP( $mappingForCNS->getRRP() - $newCNS->getPos());
					$mappingDB->add($mappingForCNS);

					if(!defined $cnsSeqs{$curMapping->getSpecies()}) {
						$cnsSeqs{$curMapping->getSpecies()} = ("-" x $mappingForCNS->getRRP()) . $mappingForCNS->getCNSSeq();
						if(length($cnsSeqs{$curMapping->getSpecies()} ) < $CNSLength) {
							$cnsSeqs{$curMapping->getSpecies()} = $cnsSeqs{$curMapping->getSpecies()} . ("-" x ($CNSLength - length($cnsSeqs{$curMapping->getSpecies()}) ));
						}
					} else {
						$cnsSeqs{$curMapping->getSpecies()} = mergeFastaSequences($cnsSeqs{$curMapping->getSpecies()}, $mappingForCNS->getCNSSeq(), $mappingForCNS->getRRP());
					}
				}
			}
		}

		if(scalar %cnsSeqs >= $minSpeciesForCNS) {
			delete $cnsSeqs{'Paralogs'};
			$newCNS->setSeq($conservatoryTree->getReconstructedSequence($newCNS, \%cnsSeqs,0));
			if($verbose) { print $newCNS->getSeq() . "\n"; }
		} else {
			$mappingDB->deleteCNS($newCNS);
			$CNSDB->deleteCNS($newCNS);
			if($verbose) { print "Dropped.\n"; }
		}
	}
	$CNSDB->updateNumberOfSupportingSpecies($mappingDB);	

	### Make fasta file for the reconstructed CNS
	my $ancesteralLocusCNSFileName = "$tmpDir/$locus.ancesteral.cns.fasta";
	open (my $ancesteralLocusCNSFasta, ">$ancesteralLocusCNSFileName");	
	foreach my $curCNS (@{ $CNSDB->getCNSByOrder() }) {
		###### Mask ancesteral sequence for homopolymers
		my $ancesteralCNSMasked = $curCNS->getSeq();
		## First, lastz does not handle ambigious characters very well (basically assuming N)
		# replace W with A. Not ideal, but better than alternative. Also, too many A's will be masked.
		$ancesteralCNSMasked =~ s/W/A/g;
		$ancesteralCNSMasked = fuzzyFilterHomopolymers($ancesteralCNSMasked,$homoPolymerFilterLength,0);
		$ancesteralCNSMasked = fuzzyFilterDimers($ancesteralCNSMasked, $dimerPolymerFilterLength,0);
		$ancesteralCNSMasked = fuzzyFilterHomopolymers($ancesteralCNSMasked,$homoPolymerFuzzyFilterLength,1);
		print $ancesteralLocusCNSFasta ">" . $curCNS->getID() . "\n$ancesteralCNSMasked\n";
	}
	close($ancesteralLocusCNSFasta);
	if($verbose) { print "PROGRESS: End ancesteral sequence reconstruction for $locus. Found " . $CNSDB->getNumberOfCNSs() . " CNS.\n"; }

	my $deepOrthologList = new CREOrthologList($genomeDB);

	if(!$justFamily) {
		### Now search for deep CNS
		foreach my $curGenome ($genomeDB->getGenomeNames()) {
			if($genomeDB->genomeToFamily($curGenome) ne $referenceGenomeFamily) {
				my $orthologListForGenome = new CREOrthologList($genomeDB, $referenceGenome, $refLocusFullName, $curGenome, $ancesteralLocusCNSFileName, $ancesteralLocusCNSFileName, 1, $verbose);
				if(defined $orthologListForGenome) { $deepOrthologList->add($orthologListForGenome); } 
			}
		}
	}
	unlink($ancesteralLocusCNSFileName);
	$deepOrthologList->pickBestPutativeOrthologs();
	if($verbose) { print "PROGRESS: Found " . $deepOrthologList->getOrthologsNumber() . " deep orthologs.\n"; }
	### Assign deep mappings to CNS

	foreach my $curOrtholog ($deepOrthologList->getOrthologsByQuality() ) {
		$mappingDB->add(loadMAFforCNSs($genomeDB, $CNSDB, $curOrtholog));
	}

	## remove the mappings not assigned to a CNS
	$mappingDB->deleteCNS($refLocusFullName);

	## Breaking CNS to sub-CNSs
	foreach my $curCNS (@{ $CNSDB->getCNSByOrder() }) {
		my @CNSBreakpoints = $mappingDB->findBreakPointsForCNS($curCNS);

		if(@CNSBreakpoints > 2) {
			if($verbose) { print "PROGRESS: Breaking CNS " . $curCNS->getID() . ". Breakpoints: " . join(",", @CNSBreakpoints) . "...\n"; }
			my @assignedMappings = $mappingDB->assignMappingsToBreakpoints($curCNS, \@CNSBreakpoints);
			for my $curBreakpoint (0.. ((scalar @CNSBreakpoints) - 2)) {
				my $breakLen = $CNSBreakpoints[$curBreakpoint+1]- $CNSBreakpoints[$curBreakpoint];
				my $CNSforBreakpointName = $curCNS->getID() . ".$CNSBreakpoints[$curBreakpoint]";

				### Create new sub-CNS
				my $CNSforBreakpoint = new CNS($curCNS->getRefGenome(), $CNSforBreakpointName, $curCNS->getLocus(), $curCNS->getPos() + $CNSBreakpoints[$curBreakpoint], $breakLen, $curCNS->getConservationLevel(),
											   $curCNS->getSupportingSpeciesNumber(), substr($curCNS->getSeq(),$CNSBreakpoints[$curBreakpoint], $breakLen), $curCNS->getRegulatorySeqPart());
				$CNSDB->add($CNSforBreakpoint);

				## Assign the mappings to the new CNS
				foreach my $curMapping (@assignedMappings) {
					### If there is an overlap, assign the deep CNS to the new subCNS.
					if($curMapping->getBreakPoint() == $curBreakpoint) {
						$curMapping->setRRP( $curMapping->getRRP() - $CNSBreakpoints[$curBreakpoint]);
						$curMapping->setCNSID($CNSforBreakpointName);		
						$mappingDB->add($curMapping);
					}
				}
				### Filtering the sub CNS
				if(scalar $mappingDB->getSpeciesForCNS($curCNS) < $minSpeciesForCNS) {  ## make sure we have enough species
					$CNSDB->deleteCNS($CNSforBreakpoint);
					$mappingDB->deleteCNS($CNSforBreakpoint);
				} else { ## see we have enough conserved sequence
					my $numberOfConservedBasesInCNS=0;
					if($CNSforBreakpoint->getRegulatorySeqPart() eq "U") {
						my $startInPhyloPMap = $CNSforBreakpoint->getPos()+$genomeDB->getUpstreamLength($referenceGenome);
						foreach my $curBase (@phyloPconservationMapUp[ $startInPhyloPMap .. ($startInPhyloPMap+ $CNSforBreakpoint->getLen()) ]) { if($curBase>= $minPhyloPscore) { $numberOfConservedBasesInCNS++; }  }
					} else {
						my $startInPhyloPMap = $CNSforBreakpoint->getPos();
						foreach my $curBase ( @phyloPconservationMapDown[ ($startInPhyloPMap)..($startInPhyloPMap+ $CNSforBreakpoint->getLen()) ] )  { if($curBase>= $minPhyloPscore) { $numberOfConservedBasesInCNS++; } }
					}
					if($numberOfConservedBasesInCNS / $CNSforBreakpoint->getLen() < $minCNSConservationAfterSplit) {
						if($verbose) { print "PROGRESS: Removing " . $CNSforBreakpoint->getID() . " because number of conserved bases is $numberOfConservedBasesInCNS out of ". $CNSforBreakpoint->getLen() .".\n"; }
						$mappingDB->deleteCNS($CNSforBreakpoint);
						$CNSDB->deleteCNS($CNSforBreakpoint);
					}
				}
			}
			### Delete the original CNS
			$CNSDB->deleteCNS($curCNS);
			$mappingDB->deleteCNS($curCNS);
		}
	}

	### Apply filters. Filter low overlap mappings
	foreach my $curCNS (@{ $CNSDB->getCNSByOrder() }) {
		foreach  my $curMapping (@{ $mappingDB->getMappingsForCNS($curCNS)}) {
			if($curMapping->getLen() / $curCNS->getLen() < $minCNSCoverageFinal) {
				$mappingDB->deleteMapping($curMapping);
			}
		}
	}
	##### Filter CNS that are too distant without sufficient support
	$CNSDB->updateNumberOfSupportingSpecies($mappingDB);

	foreach my $curCNS (@{ $CNSDB->getCNSByOrder() }) {
  		my @CNSConservationLevel= ( (0) x $CLASSIFICATION_CLASSES_NUM );	 
		my @referenceClassification = split /-/, $genomeDB->getClassification($referenceGenome);
  	    #### Now calculate conservation levels
  		foreach my $curGenome ($mappingDB->getGenomesForCNS($curCNS)) {
  	  		my @classification = split /-/ , $genomeDB->getClassification($curGenome);
    		foreach my $curClassificationLevel (0..($CLASSIFICATION_CLASSES_NUM-1)) {	
    	 		if($referenceClassification[$curClassificationLevel] ne $classification[$curClassificationLevel]) {
    	 			$CNSConservationLevel[$curClassificationLevel]++;
				}
    	 	}
		}

		foreach my $levelToCheck ( 1..($CLASSIFICATION_CLASSES_NUM-1)) {
   	  		if($curCNS->getSupportingSpeciesNumber() < $minSpeciesPerConservationLevel[$levelToCheck] ||
			($CNSConservationLevel[$levelToCheck] == $CNSConservationLevel[$levelToCheck+1] && $CNSConservationLevel[$levelToCheck]==1)) {

  	  	  		### Remove the misalignments
				foreach my $curMapping (@{ $mappingDB->getMappingsForCNS($curCNS) }) {
		  	  		my @classification = split /-/ , $genomeDB->getClassification($curMapping->getGenome());
		  	  		if($classification[$levelToCheck] ne $referenceClassification[$levelToCheck]) {
  	 	  	  			if($verbose) { print "PROGRESS: Removing alignment of " . $curMapping->getLocus() . " for " . $curMapping->getCNSID() . " at level $levelToCheck. because $classification[$levelToCheck] ne $referenceClassification[$levelToCheck]. (SpNum:" . $curCNS->getSupportingSpeciesNumber() .")\n"; }
						$mappingDB->deleteMapping($curMapping);
					}
				}
		  	}
 	  	}
	}

	$CNSDB->updateNumberOfSupportingSpecies($mappingDB);
	foreach my $curCNS (@{ $CNSDB->getCNSByOrder() }) {
		my @CNSSpecies = $mappingDB->getSpeciesForCNS($curCNS);
		if(scalar @CNSSpecies >= $minSpeciesForCNS) {
			$curCNS->setConservationLevel($conservatoryTree->findDeepestCommonNode(\@CNSSpecies ));
		} else {
			$mappingDB->deleteCNS($curCNS);
			$CNSDB->deleteCNS($curCNS);
		}
	}
	if(!$noGenomeCoordinates) { 
		if($verbose) { print "PROGRESS: Updating absolute coordinates..."; }
		$mappingDB->updateAbsoluteCoordinates($genomeDB, $verbose);
		if($verbose) { print "Done\n"; }		
	}

	if($verbose) { print "PROGRESS: Writing output files..."; }
	$CNSDB->writeDatabase($finalCNSFileName);
	$mappingDB->writeDatabase($finalMapFileName, $CNSDB);
	if($verbose) { print "Done\n"; }	
}


#### Go back to the original directory, and we are done!
chdir $curDir;	

rmdir($tmpDir);
if($verbose) { print "DONE: End $referenceGenome:$locus.\n";	}

###############################################################################################################################
###############################################################################################################################
###############################################################################################################################
#####################################                END                     ##################################################
###############################################################################################################################
###############################################################################################################################
###############################################################################################################################



###########################################################################################
################### Coordinate manipulation routines

######### Load bed file into a coordiante array

sub bedFileToCoordinateList {
	my ($bedFile, $coordinateShift) = @_;
	my @coordinateList;

	while(<$bedFile>) {
		my ($locus,$start,$end,$id,$pvalue) = split;
		push(@coordinateList, { 'Start' => $start + $coordinateShift,
								'Len' => $end - $start,
								'pValue' => $pvalue})
	}
	return @coordinateList;
}
#############################################################################################

sub writeMSAToFasta {
	my ($MSARef, $fastaFileName) = @_;
	## First, find out order
	my %seqContent;
	foreach my $curSeqName (keys %$MSARef) {
		$MSARef->{$curSeqName}  =~ s/X/N/g;
		$seqContent{$curSeqName} =  length($MSARef->{$curSeqName}) - ($MSARef->{$curSeqName} =~ tr/N//) - ($MSARef->{$curSeqName} =~ tr/-//);
	}
	my @seqOrder = reverse sort { $seqContent{$a} <=> $seqContent{$b} } keys %seqContent;
	open(my $fastaFile, ">$fastaFileName");
	foreach my $curSeqName (@seqOrder) {
		print $fastaFile ">$curSeqName\n" . $MSARef->{$curSeqName} . "\n";
	}
	close($fastaFile);
}

########################################
###### Merge coordinates
###### Accepts a reference to an array of coordiantes, pvalue cutoff, the window to merge coordiantes, minimum region length and a constant number to add to the coordinate

sub mergeCoordinates {
	my ($coordianteListRef, $pvalueCutoff, $mergeLength, $minRegionLength) = @_;
	
	### Sort the coordinate array
	my @coordinateList = sort { $a->{'Start'} <=> $b->{'Start'} } @$coordianteListRef;

	my @mergedCoordinates;
	my $currentRegionStart= $coordinateList[0]->{'Start'};
	my $currentRegionLength= $coordinateList[0]->{'Len'};
	my $currentRegionPvalue= $coordinateList[0]->{'pValue'};

	foreach my $coord (@coordinateList) {

		if($coord->{'pValue'}>=$pvalueCutoff) {
			if($currentRegionLength + $currentRegionStart + $mergeLength >= $coord->{'Start'}) {  ## if this is closer than mergeLength to the current region, merge it
				$currentRegionLength = $coord->{'Start'} - $currentRegionStart +1;
				$currentRegionPvalue += $coord->{'pValue'};
			} else { ## if not, start a new region
				if($currentRegionLength >= $minRegionLength) {  ### if it is long enough, log it
					push(@mergedCoordinates, { 'Start' => $currentRegionStart,
											'Len' => $currentRegionLength,
											'pValue' => $currentRegionPvalue/$currentRegionLength})
				}
				$currentRegionStart = $coord->{'Start'};
				$currentRegionLength=1;
				$currentRegionPvalue= $coord->{'pValue'};
			}
		}
	}

	if($currentRegionLength >= $minRegionLength) {  ### if it is long enough, log it
		push(@mergedCoordinates, { 'Start' => $currentRegionStart,
 								   'Len' => $currentRegionLength,
								   'pValue' => $currentRegionPvalue/$currentRegionLength})
	}

	return (@mergedCoordinates);
}

#################################################################################################################
#
# Convert a list of coordiantes with pvalues (from bed file) to an map array of pvalues
#

sub coordinateListToMap {
	my ($coordianteListRef, $mapLength, $coordinateShift) = @_;
	my @coordianteList = @$coordianteListRef;
	my @map = (0) x $mapLength;
	foreach my $coord (@coordianteList) {
		$map[$coord->{'Start'} + $coordinateShift] = ($coord->{'pValue'} + 0);
	}
	return @map;
}
#################################################################################################################
# Load MAF file and return a FASTA array in reference coordinates
# alignmentLengthFilter is a number 0-1 indicating what is the minimum overlap (in percent) of the alignment to the reference species
#  to be consider as a real alignment.
#

sub loadMAF {
	(my $curOrtholog, my $referenceSequenceMaxLength, my $targetSequenceMaxLength, my $MSARef,my $upOrDownStreamReference, my $upOrDownStreamTarget, my $alignmentLengthFilter) = @_;
	my $mappings = new MappingDatabase;

	for my $aln (cleanAlignments($curOrtholog->getMAFFileName($upOrDownStreamTarget)) ) {

		my @individualAlignments = $aln->each_seq();
		my $referenceAlignment = $individualAlignments[0];
		my $targetAlignment = $individualAlignments[1];

		my $referenceLocus = $referenceAlignment->id();
		my $referenceSequence = $referenceAlignment->seq();
		my $referenceRelativePosition = $referenceAlignment->start();
	    my $referenceSpecies = geneToSpecies($referenceLocus);

		my $targetLocus = $targetAlignment->id();
		my $targetSequence = $targetAlignment->seq();
		my $targetRelativePosition = $targetAlignment->start();
		my $targetGenome = geneToGenome($targetLocus);
		my $targetSpecies = geneToSpecies($targetLocus);
	
		if($targetSpecies eq $referenceSpecies) { $targetSpecies = "Paralogs"; }
		
		my $targetAlignmentStrand = $targetAlignment->strand();
		if($targetAlignmentStrand == -1 ) { $targetAlignmentStrand = "-"; } else {$targetAlignmentStrand = "+"; }

		if( $upOrDownStreamTarget eq "U") {
			$targetRelativePosition -= $targetSequenceMaxLength +1;
		}
		if($upOrDownStreamReference eq "U") {
			$referenceRelativePosition -= $referenceSequenceMaxLength +1;
		}



	   	if(not defined $MSARef->{$referenceSpecies}) {
	    	$MSARef->{$referenceSpecies} = 'N' x $referenceSequenceMaxLength;;
	   	}
	   	if(not defined $MSARef->{$targetSpecies}) {
		   	$MSARef->{$targetSpecies} = 'N' x $referenceSequenceMaxLength;
	   	}
	
		my $referenceSequenceLength = length($referenceSequence);
		#### Log the alignment, if it is at the right length

		if($alignmentLengthFilter==0 || $referenceSequenceLength >= $alignmentLengthFilter*$referenceSequenceMaxLength ) {
			# Merge the aligned sequences
			my $mapping = new Mapping($referenceLocus, geneToSpecies($targetLocus), $targetLocus, $targetRelativePosition, $targetAlignmentStrand, $referenceRelativePosition, $targetSequence, $referenceSequence);
			$mappings->add($mapping);
			$MSARef->{$targetSpecies} = mergeFastaSequences($MSARef->{$targetSpecies}, $mapping->getCNSSeq(), $referenceAlignment->start()-1);

		} else {
			if($verbose) {
				print "PROGRESS: Rejected alignment to $referenceLocus because length is too short ($referenceSequenceLength). CNS length: " . $referenceSequenceMaxLength . "\n";
			}
		}
	}
	return $mappings;
}


sub loadMAFforCNSs {
	my ($genomeDB, $CNSDB, $ortholog  ) = @_;
	my $mappings = new MappingDatabase;
	my @upOrDownDictionary;
	my @alignments = cleanAlignments($ortholog->getMAFFileName("U"));
	push @upOrDownDictionary, ("U") x (scalar @alignments);
	push @alignments, cleanAlignments($ortholog->getMAFFileName("D"));
	push @upOrDownDictionary, ("D") x (scalar @alignments);

	foreach my $curAlnIndex (0.. scalar @alignments -1) {
		my $aln = $alignments[$curAlnIndex];
		my @individualAlignments = $aln->each_seq();
		my $referenceAlignment = $individualAlignments[0];
		my $targetAlignment = $individualAlignments[1];

		my $referenceLocus = $referenceAlignment->id();
		my $referenceSequence = $referenceAlignment->seq();
		my $referenceRelativePosition = $referenceAlignment->start();
	    my $referenceSpecies = geneToSpecies($referenceLocus);

		my $targetLocus = $targetAlignment->id();
		my $targetSequence = $targetAlignment->seq();
		my $targetRelativePosition = $targetAlignment->start() ;
		my $targetGenome = geneToGenome($targetLocus);
		my $targetSpecies = geneToSpecies($targetLocus);

		if( $upOrDownDictionary[$curAlnIndex] eq "U") {
			$targetRelativePosition -= $genomeDB->getUpstreamLength($targetGenome) +1;
		}

		my $targetAlignmentStrand = $targetAlignment->strand();
		if($targetAlignmentStrand == -1 ) { $targetAlignmentStrand = "-"; } else {$targetAlignmentStrand = "+"; }

		if($CNSDB->exists($referenceLocus) && (($upOrDownDictionary[$curAlnIndex] eq "U" && $CNSDB->getCNSByID($referenceLocus)->getPos() < 0) || ($upOrDownDictionary[$curAlnIndex] eq "D" && $CNSDB->getCNSByID($referenceLocus)->getPos() >= 0) )) {
			my $mapping = new Mapping($referenceLocus, $targetSpecies, $targetLocus, $targetRelativePosition, $targetAlignmentStrand, $referenceRelativePosition, $targetSequence, $referenceSequence);
			$mappings->add($mapping);
		}
	}

	return $mappings;
}

### Takes a MAF files, reads the alignments and returns a set of non-overlapping SimpleAlign objects

sub cleanAlignments {
	my ($mafFileName) = @_;
	my @cleanAlignments;

	my $maffile = Bio::AlignIO->new(-file => $mafFileName,
					-format => "maf") || die ("");

	# First clean up the MAF file. If we have multiple alignments to the same place, pick just the best one
	my $overlappingSource;
	while(my $curAln = $maffile->next_aln) {
		#check to see if we overlap something
		my $source = $curAln->get_seq_by_pos(1);
		my $target = $curAln->get_seq_by_pos(2);
		$overlappingSource=0;
		for my $curAlignmentNum (0..(@cleanAlignments-1)) {
			my $scannedAln = $cleanAlignments[$curAlignmentNum];
			my $scanSource = $scannedAln->get_seq_by_pos(1);
			my $scanTarget = $scannedAln->get_seq_by_pos(2);
			if(($target->id() eq $scanTarget->id()) && 
			( overlapFragment($scanSource->start, $scanSource->end, $source->start, $source->end) > $alignmentSeqOverlapToMerge ||
			 overlapFragment($source->start, $source->end, $scanSource->start, $scanSource->end) > $alignmentSeqOverlapToMerge ) ) {
				## If the hits are not coming from the same region, pick the best one
				$overlappingSource=1;
				if($curAln->score > $scannedAln->score) {
					$cleanAlignments[$curAlignmentNum] = $curAln;
				}
			}
		}
		if(!$overlappingSource) {
			push @cleanAlignments, $curAln;
		}
	}
	return @cleanAlignments;
}

